Client Report - To Infinity and Beyond…wait wrong movie

Unit 5 Stretch

Author

Ezekial Curran

Background

Survey data is notoriously difficult to munge. Even when the data is recorded cleanly the options for ‘write in questions’, ‘choose from multiple answers’, ‘pick all that are right’, and ‘multiple choice questions’ makes storing the data in a tidy format difficult.

In 2014, FiveThirtyEight surveyed over 1000 people to write the article titled, America’s Favorite ‘Star Wars’ Movies (And Least Favorite Characters). They have provided the data on GitHub.

For this project, the goal is to validate the data provided on GitHub lines up with the articel be recreating at least two of the visuals and predict if a person from the survey makes at least $50k.

Imports and Functions

Show the code
import random
import polars as pl
import numpy as np
import xgboost as xgb
from lets_plot import *
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import (
  classification_report, 
  accuracy_score, 
  recall_score, 
  precision_score, 
  f1_score
  )
# add the additional libraries you need to import for ML here

LetsPlot.setup_html(isolated_frame=True)
Show the code
# import your data here using pandas and the URL
url = "https://github.com/fivethirtyeight/data/raw/master/star-wars-survey/StarWars.csv"
df = pl.read_csv("StarWars.csv")

df_clean = df.rename({
    df.columns[1]: "seen",
    df.columns[2]: "fan",
    **{df.columns[i]: f"seen_epi_{['i', 'ii', 'iii', 'iv', 'v', 'vi'][i - 3]}" for i in range(3, 9)},
    **{df.columns[i]: f"rank_epi_{['i', 'ii', 'iii', 'iv', 'v', 'vi'][i - 9]}" for i in range(9, 15)},
    **{df.columns[i]: df[df.columns[i]][0] for i in range(15, 29)},
    df.columns[29]: "shot_first",
    df.columns[30]: "ex_uni",
    df.columns[31]: "fan_ex_uni",
    df.columns[32]: "fan_star_trek",
    **{df.columns[i]: df.columns[i].lower().replace(' ', '_') for i in range(33, 37)},
    df.columns[37]: "location"
})

df_clean = df_clean[1:]
Show the code
# Helper functions and shared lists
seen_dict = {"seen_epi_i_yes": 1, "seen_epi_ii_yes": 2, "seen_epi_iii_yes": 3, "seen_epi_iv_yes": 4, "seen_epi_v_yes": 5, "seen_epi_vi_yes": 6}
rank_dict = {"rank_epi_i": 1, "rank_epi_ii": 2, "rank_epi_iii": 3, "rank_epi_iv": 4, "rank_epi_v": 5, "rank_epi_vi": 6,}
favor_cols = [df[df.columns[i]][0] for i in range(15, 29)]
shot_cols = ["shot_first_Greedo", "shot_first_Han", "shot_first_I don't understand this question"]
movie_dict = {1: "The Phantom Menace", 2: "Attack of the Clones", 3: "Revenge of the Sith", 4: "A New Hope", 5: "The Empire Strikes Back", 6: "Return of the Jedi"}

def GetSeen(data):
  data_seen = data.filter(
    pl.any_horizontal([pl.col(i) > 0 for i in seen_dict.keys()])
  ).select(seen_dict.keys())

  seen_percs = data_seen.unpivot(variable_name="movie", value_name="percentage").group_by("movie").agg(pl.col('percentage').sum() / data_seen.height)
  seen_percs = seen_percs.with_columns(
    ((pl.col("percentage") * 100).round(0).cast(pl.Int64).cast(pl.String) + '%').alias('perc_label')
  )

  # Specifies the order
  seen_percs = seen_percs.with_columns(
    pl.col('movie').replace_strict(seen_dict, default=0).alias('movie_order')
  ).with_columns(
    pl.col('movie_order').replace_strict(movie_dict, default="0").alias('movie')
  ).sort('movie_order', descending=True)

  return seen_percs

def GetRanks(data):
  data_rank = data.filter(
    pl.all_horizontal([pl.col(i) > 0 for i in seen_dict.keys()])
  ).select(rank_dict.keys())

  # Episode 3 has 1 missing rank (when compared to the other movies the rank option left is 6)
  data_rank = data_rank.with_columns(pl.col('rank_epi_iii').replace(0, 6))

  rank_percs = data_rank.unpivot(variable_name="movie", value_name="percentage").group_by("movie").agg(pl.col('percentage').filter(pl.col('percentage') == 1).sum() / data_rank.height)
  rank_percs = rank_percs.with_columns(
    ((pl.col("percentage") * 100).round(0).cast(pl.Int64).cast(pl.String) + '%').alias('perc_label')
  )

  # Specifies the order
  rank_percs = rank_percs.with_columns(
    pl.col('movie').replace_strict(rank_dict, default=0).alias('movie_order')
  ).with_columns(
    pl.col('movie_order').replace_strict(movie_dict, default="0").alias('movie')
  ).sort('movie_order', descending=True)

  return rank_percs

def GetRatings(data):
  data_ratings = data.filter(
    pl.all_horizontal([pl.col(i) > 0 for i in seen_dict.keys()])
  ).select(rank_dict.keys())

  # data_ratings = data_ratings.with_columns(pl.col('rank_epi_iii').replace("0", "6"))
  data_ratings = data_ratings.with_columns(pl.col('rank_epi_iii').replace(0, 6))

  thirds = data_ratings.with_columns(
    pl.col(i).replace_strict({
      1: 1,
      2: 1,
      3: 2,
      4: 2,
      5: 3,
      6: 3
    }, default=0).alias(i) for i in data_ratings.columns
  )

  third_counts = thirds.unpivot(variable_name="movie").group_by(['movie', 'value']).len().pivot('value', index='movie')

  third_percs = third_counts.with_columns(
    [(pl.col(third_counts.columns[i]) / thirds.height).alias(third_counts.columns[i]) for i in range(1, 4)]
  )

  third_percs = third_percs.with_columns(pl.col('movie').replace_strict(rank_dict, default=0).alias('movie_order')).with_columns(pl.col('movie_order').replace_strict(movie_dict, default="0").alias('movie')).sort('movie_order', descending=True).select(['movie', '1', '2', '3'])

  third_percs = third_percs.rename({"1": 'Top third', "2": 'Middle third', "3": 'Bottom third'})

  third_long = third_percs.unpivot(index='movie', variable_name='rating', value_name='percentage')

  third_long = third_long.with_columns(
    ((pl.col("percentage") * 100).round(0).cast(pl.Int64).cast(pl.String) + '%').alias('perc_label')
  )

  return third_long

def GetFavorability(data):
  data_favors = data.filter(
    pl.any_horizontal([pl.col(i) > 0 for i in seen_dict.keys()])
  ).select(favor_cols)

  fourth = data_favors.with_columns(
    pl.col(i).replace_strict({
        0: 1,
        1: 1,
        2: 2,
        3: 2,
        4: 3,
        5: 4,
        6: 4
    }, default=0).alias(i) for i in data_favors.columns
  )

  fourth_counts = fourth.unpivot(variable_name="character").group_by(['character', 'value']).len().pivot('value', index='character').sort('4')

  fourth_percs = fourth_counts.with_columns(
    [(pl.col(fourth_counts.columns[i]) / fourth.height).alias(fourth_counts.columns[i]) for i in range(1, 5)]
  )

  fourth_percs = fourth_percs.rename({"1": 'Unfamiliar', "2": 'Unfavorable', "3": 'Neutral', "4": "Favorable"})
  
  fourth_percs = fourth_percs.select(['character', 'Favorable', 'Neutral', 'Unfavorable', 'Unfamiliar'])

  fourth_long = fourth_percs.unpivot(index='character', variable_name='favor', value_name='percentage')

  fourth_long = fourth_long.with_columns(
    ((pl.col("percentage") * 100).round(0).cast(pl.Int64).cast(pl.String) + '%').alias('perc_label')
  )

  return fourth_long

def GetShot(data):
  data_shot = data.filter(
    pl.any_horizontal([pl.col(i) > 0 for i in shot_cols])
  ).select(shot_cols)

  shot_percs = data_shot.unpivot(variable_name="who", value_name="percentage").group_by("who").agg(
      pl.col('percentage').sum() / data_shot.height
    ).with_columns(
      pl.col("who").replace({
        "shot_first_Han": "Han",
        "shot_first_Greedo": "Greedo",
        "shot_first_I don't understand this question": "I don't understand\nthis question"
      })
    )

  shot_percs = shot_percs.with_columns(
    ((pl.col("percentage") * 100).round(0).cast(pl.Int64).cast(pl.String) + '%').alias('perc_label')
  )

  # Designed to specify the order
  shot_percs = shot_percs.with_columns(
    pl.col('who').replace_strict({
      "Han": 2,
      "Greedo": 1
    }, default=0).alias('who_order')
  ).sort('who_order')


  return shot_percs

def FindMdls(X_train, X_test, y_train, y_test):
  param_grid = {
    'n_estimators': [50, 100, 150, 200, 250],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6, 7],
    'subsample': [0.75, 0.8, 0.85, 0.9, 1.0]
  }
  gb = GradientBoostingClassifier(random_state=795)

  grid_search = GridSearchCV(estimator=gb, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

  grid_search.fit(X_train, y_train)

  print(f"Best Parameters: ", grid_search.best_params_)
  print(f"Best Score: ", grid_search.best_score_)

  # grid_search does implement predict
  predictions = grid_search.predict(X_test)
  print(classification_report(y_test, predictions))

def FindXGBMdls(X_features, y_targets):
  param_grid = {
    'n_estimators': [50, 100, 150, 200, 250],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6, 7],
    'subsample': [0.75, 0.8, 0.85, 0.9, 1.0]
  }

  rnd = 795

  gb = GradientBoostingClassifier(
    random_state=rnd,
    n_estimators=50
  )

  # grid_search = GridSearchCV(estimator=gb, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

  # grid_search.fit(X_train, y_train)
  # print(f"Best Parameters: ", grid_search.best_params_)
  # print(f"Best Score: ", grid_search.best_score_)

  # predictions = grid_search.predict(X_test)
  # print(classification_report(y_test, predictions))
  X_train, X_test, y_train, y_test = train_test_split(X_features, y_targets, random_state=rnd, test_size=0.2)
  gb.fit(X_train, y_train)
  preds = gb.predict(X_test)
  acc = accuracy_score(y_test, preds)
  # f1 = f1_score(y_test, preds)
  print(classification_report(y_test, preds))

  best_mdl = gb
  best_rnd = rnd
  best_acc = acc
  # best_f1 = f1

  print(f"Current random number: {rnd} with accuracy of: {acc}")

  randos = np.random.randint(1, 1000, size=100)
  for i in randos:
    # rnd = random.randint(rand_min, rand_max)
    rnd_mdl = GradientBoostingClassifier(
      random_state=i,
      n_estimators=50
    )

    X_train, X_test, y_train, y_test = train_test_split(X_features, y_targets, random_state=i, test_size=0.2)

    rnd_mdl.fit(X_train, y_train)
    rnd_pred = rnd_mdl.predict(X_test)
    rnd_acc = accuracy_score(y_test, rnd_pred)
    # rnd_f1 = f1_score(y_test, rnd_pred)

    if (rnd_acc > best_acc):
      best_mdl = rnd_mdl
      best_acc = rnd_acc
      # best_f1 = rnd_f1
      best_rnd = i

    print(f"Current random number: {i} with accuracy of: {rnd_acc}")

  print(f"Best random number: {best_rnd} with accuracy of: {best_acc}")
Show the code
# Include and execute your code here
df_clean = df_clean.with_columns(
  pl.col('household_income').replace({
    "$150,000+": 2,
    "$100,000 - $149,999": 2,
    "$50,000 - $99,999": 2,
    "$25,000 - $49,999": 1,
    "$0 - $24,999": 1
  }, default=0).alias("income")
)
df_clean = df_clean.drop(['household_income'])

df_all = df_clean

df_clean = df_clean.with_columns(
  pl.col("age").replace_strict({
    "> 60": 4,
    "45-60": 3,
    "30-44": 2,
    "18-29": 1,
  }, default=0).alias("age")
)

df_clean = df_clean.with_columns(
  pl.col(df_clean.columns[i]).replace({
    "Very favorably": 6,
    "Somewhat favorably": 5,
    "Neither favorably nor unfavorably (neutral)": 4,
    "Somewhat unfavorably": 3,
    "Very unfavorably": 2,
    "Unfamiliar (N/A)": 1
  }, default=0).alias(df_clean.columns[i]) for i in range(15, 29)
)

df_clean = df_clean.with_columns(
  pl.when((pl.col(df_clean.columns[i]).is_null()))
  .then(pl.lit("no"))
  .otherwise(pl.lit("yes"))
  .alias(df_clean.columns[i]) for i in range(3, 9)
)

df_clean = df_clean.with_columns(
  pl.col(i).fill_null(0) for i in rank_dict.keys()
)

df_clean = df_clean.cast({'rank_epi_i': pl.Int64, 'rank_epi_ii': pl.Int64, 'rank_epi_iii': pl.Int64, 'rank_epi_iv': pl.Int64, 'rank_epi_v': pl.Int64, 'rank_epi_vi': pl.Int64})


df_clean = df_clean.with_columns(
  pl.col('education').replace_strict({
    "Less than high school degree": 1.0,
    "High school degree": 1.5,
    "Some college or Associate degree": 2.0,
    "Bachelor degree": 4.0,
    "Graduate degree": 7.0
  }, default=0.0).alias('education')
)

cat_cols = ['seen', 'fan', 'seen_epi_i', 'seen_epi_ii', 'seen_epi_iii', 'seen_epi_iv', 'seen_epi_v', 'seen_epi_vi', 'shot_first', 'ex_uni', 'fan_ex_uni', 'fan_star_trek', 'gender', 'location']

all_code = df_all.columns[1:37]

enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
enc_ar = enc.fit_transform(df_clean[cat_cols])
enc_cols = list(enc.get_feature_names_out(cat_cols))
enc_df = pl.DataFrame(enc_ar, schema=enc_cols)
df_tot = pl.concat([df_clean.drop(cat_cols), enc_df], how='horizontal')

enc_all = enc.fit_transform(df_all[all_code])
enc_all_cols = list(enc.get_feature_names_out(all_code))
enc_all_df = pl.DataFrame(enc_all, schema=enc_all_cols)

X = df_tot.drop(['RespondentID', 'income'])
y = df_tot.select('income')

X_all = enc_all_df
y_all = df_all.select('income')

print(f"Here is a data preview.")
display(df_clean.head(7))
Here is a data preview.
shape: (7, 38)
RespondentID seen fan seen_epi_i seen_epi_ii seen_epi_iii seen_epi_iv seen_epi_v seen_epi_vi rank_epi_i rank_epi_ii rank_epi_iii rank_epi_iv rank_epi_v rank_epi_vi Han Solo Luke Skywalker Princess Leia Organa Anakin Skywalker Obi Wan Kenobi Emperor Palpatine Darth Vader Lando Calrissian Boba Fett C-3P0 R2 D2 Jar Jar Binks Padme Amidala Yoda shot_first ex_uni fan_ex_uni fan_star_trek gender age education location income
i64 str str str str str str str str i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 str str str str str i64 f64 str i64
3292879998 "Yes" "Yes" "yes" "yes" "yes" "yes" "yes" "yes" 3 2 1 4 5 6 6 6 6 6 6 6 6 1 1 6 6 6 6 6 "I don't understand this questi… "Yes" "No" "No" "Male" 1 1.5 "South Atlantic" 0
3292879538 "No" null "no" "no" "no" "no" "no" "no" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 null null null "Yes" "Male" 1 4.0 "West South Central" 1
3292765271 "Yes" "No" "yes" "yes" "yes" "no" "no" "no" 1 2 3 4 5 6 5 5 5 5 5 1 1 1 1 1 1 1 1 1 "I don't understand this questi… "No" null "No" "Male" 1 1.5 "West North Central" 1
3292763116 "Yes" "Yes" "yes" "yes" "yes" "yes" "yes" "yes" 5 6 1 2 4 3 6 6 6 6 6 5 6 5 3 6 6 6 6 6 "I don't understand this questi… "No" null "Yes" "Male" 1 2.0 "West North Central" 2
3292731220 "Yes" "Yes" "yes" "yes" "yes" "yes" "yes" "yes" 5 4 6 2 1 3 6 5 5 3 6 2 5 4 6 5 5 2 5 5 "Greedo" "Yes" "No" "No" "Male" 1 2.0 "West North Central" 2
3292719380 "Yes" "Yes" "yes" "yes" "yes" "yes" "yes" "yes" 1 4 3 6 5 2 6 6 6 6 6 4 6 4 5 5 5 5 4 6 "Han" "Yes" "No" "Yes" "Male" 1 4.0 "Middle Atlantic" 1
3292684787 "Yes" "Yes" "yes" "yes" "yes" "yes" "yes" "yes" 6 5 4 3 1 2 6 6 5 5 6 6 6 6 6 5 6 3 5 6 "Han" "Yes" "No" "No" "Male" 1 1.5 "East North Central" 0

Question 1

  1. Build a machine learning model that predicts whether a person makes at least $50k with accuracy of at least 65%. Describe your model and report the accuracy.

The gradient boosting classifier model was used on the data. Limiting the n_estimators to 50 gave better results than increasing it, the accuracy of the model acheived 69%. Which isn’t all that good, but the data has a lot of missing values and the dataset of non missing values isn’t all that large.

Show the code
# Include and execute your code here
# 693 sucks
# 798, 343, 293, 841, 795
rnd = 795
X_trn, X_tst, y_trn, y_tst = train_test_split(X, y, random_state=rnd, test_size=0.2)

model = GradientBoostingClassifier(random_state=rnd, n_estimators=50)
model.fit(X_trn, y_trn)
model = model.predict(X_tst)
print(classification_report(y_tst, model))
              precision    recall  f1-score   support

           0       0.93      0.41      0.57        63
           1       0.48      0.61      0.54        46
           2       0.72      0.85      0.78       129

    accuracy                           0.69       238
   macro avg       0.71      0.62      0.63       238
weighted avg       0.73      0.69      0.68       238
Show the code
# FindMdls(X_trn, X_tst, y_trn, y_tst)

Question 2

  1. Validate that the data provided on GitHub lines up with the article by recreating a 3rd visual from the article.

Using the data provided on GitHub, all 5 graphs from the article can be recreated. However, there are a few points of rounding error from some of the graphs but they ultimately refelct the results from the article.

Show the code
# Five charts:
# Which 'Star Wars' Movies Have You Seen?
df_seen = GetSeen(df_tot)
movie_graph = (
    ggplot(data=df_seen)
    + geom_bar(mapping=aes(x='percentage', y='movie'), stat='identity', orientation='y', color="lightblue", fill="lightblue")
    + labs(
      title="Which 'Star Wars' Movie's Have You Seen?",
      subtitle="Of 835 respondents who have seen any film",
      x='',
      y=''
    )
    + scale_x_continuous(limits=[0,1])
    + geom_text(aes(x='percentage', y='movie', label='perc_label'), nudge_x=0.075, size=12, color='black')
    + theme(
        panel_background=element_rect(fill='gray', linetype=0),
        plot_background=element_rect(fill='gray'),
        # panel_grid_major=element_rect(fill='gray'),
        panel_grid=element_blank(),
        legend_background=element_rect(fill='gray'),
        axis_text=element_text(color='black', size=18),
        # axis_title=element_text(color='white'),
        plot_title=element_text(color='black', face="bold", hjust=0, size=25),
        plot_subtitle=element_text(color='black', hjust=0, size=20),
        legend_text=element_text(color='white'),
        legend_title=element_text(color='white'),
        label_text=element_text(color='white'),
        axis_line_x=element_blank(),
        axis_ticks_x=element_blank(),
        axis_text_x=element_blank(),
        plot_title_position='plot'
    )
    + ggsize(1600, 900)
)

display(movie_graph)
# What's the Best 'Star Wars' Movie?
df_ranks = GetRanks(df_tot)
rank_graph = (
    ggplot(data=df_ranks)
    + geom_bar(mapping=aes(x='percentage', y='movie'), stat='identity', orientation='y', color="lightblue", fill="lightblue")
    + labs(
      title="What's the Best 'Star Wars' Movie?",
      subtitle="Of 471 respondents who have seen all 6 films",
      x='',
      y=''
    )
    + scale_x_continuous(limits=[0,0.4])
    + geom_text(aes(x='percentage', y='movie', label='perc_label'), nudge_x=0.035, size=12, color='black')
    + theme(
        panel_background=element_rect(fill='gray', linetype=0),
        plot_background=element_rect(fill='gray'),
        panel_grid=element_blank(),
        legend_background=element_rect(fill='gray'),
        axis_text=element_text(color='black', size=18),
        plot_title=element_text(color='black', face="bold", hjust=0, size=25),
        plot_subtitle=element_text(color='black', hjust=0, size=20),
        legend_text=element_text(color='white'),
        legend_title=element_text(color='white'),
        label_text=element_text(color='white'),
        axis_line_x=element_blank(),
        axis_ticks_x=element_blank(),
        axis_text_x=element_blank(),
        plot_title_position='plot'
    )
    + ggsize(1600, 900)
)

display(rank_graph)
# How People Rate the 'Star Wars' Movies
df_ratings = GetRatings(df_tot)
rate_graph = (
    ggplot(data=df_ratings)
    + geom_bar(mapping=aes(x='percentage', y='movie', color='rating', fill='rating'), stat='identity', orientation='y', position='dodgev')
    + facet_wrap(facets='rating', ncol=3, order=-1)
    + guides(color='none', fill='none')
    + labs(
      title="How People Rate the 'Star Wars' Movies",
      subtitle="How often each film was rated in the top, middle, and bottom third\n(by 471 respondents who have seen all six films)",
      x='',
      y=''
    )
    + scale_x_continuous(limits=[0,1])
    + geom_text(aes(x='percentage', y='movie', label='perc_label'), nudge_x=0.25, size=12, color='black')
    + theme(
        panel_background=element_rect(fill='gray', linetype=0),
        plot_background=element_rect(fill='gray'),
        panel_grid=element_blank(),
        legend_background=element_rect(fill='gray'),
        axis_text=element_text(color='black', size=18),
        plot_title=element_text(color='black', face="bold", hjust=0, size=25),
        plot_subtitle=element_text(color='black', hjust=0, size=20),
        legend_text=element_text(color='white'),
        legend_title=element_text(color='white'),
        label_text=element_text(color='white'),
        axis_line_x=element_blank(),
        axis_ticks_x=element_blank(),
        axis_text_x=element_blank(),
        plot_title_position='plot',
        text=element_text(color='black'),
        strip_text=element_text(face='bold', size=20)
    )
    + ggsize(1800, 900)
)

display(rate_graph)
# 'Star Wars' Character Favorability Ratings
df_favors = GetFavorability(df_tot)
favor_graph = (
    ggplot(data=df_favors)
    + geom_bar(mapping=aes(x='percentage', y='character', color='favor', fill='favor'), stat='identity', orientation='y', position='dodgev')
    + facet_wrap(facets='favor', ncol=4, order=0)
    + guides(color='none', fill='none')
    + labs(
      title="'Star Wars' Character Favorability Ratings",
      subtitle="By 835 respondants",
      x='',
      y=''
    )
    + scale_x_continuous(limits=[0,1.3])
    + geom_text(aes(x='percentage', y='character', label='perc_label'), nudge_x=0.25, size=8, color='black')
    + theme(
        panel_background=element_rect(fill='gray', linetype=0),
        plot_background=element_rect(fill='gray'),
        panel_grid=element_blank(),
        legend_background=element_rect(fill='gray'),
        axis_text=element_text(color='black', size=18),
        plot_title=element_text(color='black', face="bold", hjust=0, size=25),
        plot_subtitle=element_text(color='black', hjust=0, size=20),
        legend_text=element_text(color='white'),
        legend_title=element_text(color='white'),
        label_text=element_text(color='white'),
        axis_line_x=element_blank(),
        axis_ticks_x=element_blank(),
        axis_text_x=element_blank(),
        plot_title_position='plot',
        text=element_text(color='black'),
        strip_text=element_text(face='bold', size=20)
    )
    + ggsize(1800, 1500)
)

display(favor_graph)
# Who Shot First?
df_shot = GetShot(df_tot)
shot_graph = (
    ggplot(data=df_shot)
    + geom_bar(mapping=aes(x='percentage', y='who'), stat='identity', orientation='y', color="lightblue", fill="lightblue")
    + labs(
      title="Who Shot First?",
      subtitle="According to 828 respondents",
      x='',
      y=''
    )
    + scale_x_continuous(limits=[0,0.45])
    + geom_text(aes(x='percentage', y='who', label='perc_label'), nudge_x=0.05, size=12, color='black')
    + theme(
        panel_background=element_rect(fill='gray', linetype=0),
        plot_background=element_rect(fill='gray'),
        # panel_grid_major=element_rect(fill='gray'),
        panel_grid=element_blank(),
        legend_background=element_rect(fill='gray'),
        axis_text=element_text(color='black', size=20),
        # axis_title=element_text(color='white'),
        plot_title=element_text(color='black', face="bold", hjust=0, size=25),
        plot_subtitle=element_text(color='black', hjust=0, size=20),
        legend_text=element_text(color='white'),
        legend_title=element_text(color='white'),
        label_text=element_text(color='white'),
        axis_line_x=element_blank(),
        axis_ticks_x=element_blank(),
        axis_text_x=element_blank(),
        plot_title_position='plot'
    )
    + ggsize(1600, 900)
)

display(shot_graph)

Question 3

  1. Create a new column that converts the location groupings to a single number (a.k.a. label encoding). Drop the location categorical column.

Changing the encoder for location from One Hot encoding to Label Encoding puts all the locations as a single column with integers representing each location. Using Label Encoding could imply ordinality in the locations which isn’t quite true, and training a new model with everything else the same except the location does hurt the models performance.

Show the code
alt_cols = ['seen', 'fan', 'seen_epi_i', 'seen_epi_ii', 'seen_epi_iii', 'seen_epi_iv', 'seen_epi_v', 'seen_epi_vi', 'shot_first', 'ex_uni', 'fan_ex_uni', 'fan_star_trek', 'gender', 'education']

enc_lab = LabelEncoder()
enc_lab_ar = enc_lab.fit_transform(df_clean['location'])
df_alt = df_clean.with_columns(
  location=pl.Series(enc_lab_ar)
)

enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
enc_alt = enc.fit_transform(df_alt[alt_cols])
enc_alt_cols = list(enc.get_feature_names_out(alt_cols))
enc_alt_df = pl.DataFrame(enc_alt, schema=enc_alt_cols)
df_alt_tot = pl.concat([df_alt.drop(alt_cols), enc_alt_df], how='horizontal')


X_alt = df_alt_tot.drop(['RespondentID', 'income'])
y_alt = df_alt_tot.select('income')

df_alt.head(7)
shape: (7, 38)
RespondentID seen fan seen_epi_i seen_epi_ii seen_epi_iii seen_epi_iv seen_epi_v seen_epi_vi rank_epi_i rank_epi_ii rank_epi_iii rank_epi_iv rank_epi_v rank_epi_vi Han Solo Luke Skywalker Princess Leia Organa Anakin Skywalker Obi Wan Kenobi Emperor Palpatine Darth Vader Lando Calrissian Boba Fett C-3P0 R2 D2 Jar Jar Binks Padme Amidala Yoda shot_first ex_uni fan_ex_uni fan_star_trek gender age education location income
i64 str str str str str str str str i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 str str str str str i64 f64 i64 i64
3292879998 "Yes" "Yes" "yes" "yes" "yes" "yes" "yes" "yes" 3 2 1 4 5 6 6 6 6 6 6 6 6 1 1 6 6 6 6 6 "I don't understand this questi… "Yes" "No" "No" "Male" 1 1.5 6 0
3292879538 "No" null "no" "no" "no" "no" "no" "no" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 null null null "Yes" "Male" 1 4.0 8 1
3292765271 "Yes" "No" "yes" "yes" "yes" "no" "no" "no" 1 2 3 4 5 6 5 5 5 5 5 1 1 1 1 1 1 1 1 1 "I don't understand this questi… "No" null "No" "Male" 1 1.5 7 1
3292763116 "Yes" "Yes" "yes" "yes" "yes" "yes" "yes" "yes" 5 6 1 2 4 3 6 6 6 6 6 5 6 5 3 6 6 6 6 6 "I don't understand this questi… "No" null "Yes" "Male" 1 2.0 7 2
3292731220 "Yes" "Yes" "yes" "yes" "yes" "yes" "yes" "yes" 5 4 6 2 1 3 6 5 5 3 6 2 5 4 6 5 5 2 5 5 "Greedo" "Yes" "No" "No" "Male" 1 2.0 7 2
3292719380 "Yes" "Yes" "yes" "yes" "yes" "yes" "yes" "yes" 1 4 3 6 5 2 6 6 6 6 6 4 6 4 5 5 5 5 4 6 "Han" "Yes" "No" "Yes" "Male" 1 4.0 2 1
3292684787 "Yes" "Yes" "yes" "yes" "yes" "yes" "yes" "yes" 6 5 4 3 1 2 6 6 5 5 6 6 6 6 6 5 6 3 5 6 "Han" "Yes" "No" "No" "Male" 1 1.5 0 0
Show the code
X_trn_alt, X_tst_alt, y_trn_alt, y_tst_alt = train_test_split(X_alt, y_alt, random_state=rnd, test_size=0.2)

model_alt = GradientBoostingClassifier(random_state=rnd, n_estimators=50)
model_alt.fit(X_trn_alt, y_trn_alt)
pred_alt = model_alt.predict(X_tst_alt)
print(classification_report(y_tst_alt, pred_alt))
              precision    recall  f1-score   support

           0       0.93      0.41      0.57        63
           1       0.48      0.57      0.52        46
           2       0.71      0.85      0.77       129

    accuracy                           0.68       238
   macro avg       0.71      0.61      0.62       238
weighted avg       0.72      0.68      0.67       238